import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifierCV
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder, FunctionTransformer, StandardScaler
from tensorflow import keras
from scipy.stats import f_oneway, chi2_contingency
import pycaret.classification as pc
import boto3
import chart_studio
import chart_studio.plotly as py
public data: https://www.kaggle.com/datasets/maharshipandya/-spotify-tracks-dataset
personal data: spotify API
The objective of this analysis is to get Spotify data from a public dataset and try to construct a classification model to predict the track genre as a function of several features.
There are two datasets: one from the user, obtained using the Spotify API to retrieve the user's library tracks; the other is a public dataset containing roughly 114,000 tracks (1,000 per genre) with mostly the same information as the user data. This analysis is centered on the public data, as it contains thousands of different tracks with equal representation of several genres, so it is a good dataset to train an ML algorithm; a comparison is also made between the two datasets to see how similar or different they are, and to check whether the model built from public data could be used for the user data.
User and public data contain information of tracks, including title, artist, duration, and some features used internally in Spotify and assigned to each track, such as danceability, acousticness, etc. User data contains some extra features, such as number of sections, tempo changes, and some more.
The meaning of the features can be seen in the API documentation: https://developer.spotify.com/documentation/web-api/reference/get-track
def read_full_table(table_name):
    """Scan an entire DynamoDB table and return it as a DataFrame.

    A single scan call returns at most one page of items, so we keep
    scanning with ExclusiveStartKey until no LastEvaluatedKey remains.
    """
    session = boto3.Session(profile_name="default")
    dynamodb = session.resource("dynamodb", region_name="eu-west-1")
    table = dynamodb.Table(table_name)
    response = table.scan()
    items = response["Items"]
    # Follow pagination until the table is exhausted.
    while "LastEvaluatedKey" in response:
        response = table.scan(ExclusiveStartKey=response["LastEvaluatedKey"])
        items.extend(response["Items"])
    return pd.DataFrame(items)
# Feature-name groups used throughout the analysis.
# Numeric features available in the user (API) data.
numerical_cols = ["num_sections", "danceability", "sections_avg_duration", "instrumentalness", "liveness",
                  "loudness", "duration", "speechiness", "valence", "dynamics_changes", "tempo_changes",
                  "acousticness", "time_signature_changes", "popularity", "mode_changes", "energy",
                  "key_changes", "tempo"]
# Subset of numeric features also present in the public Kaggle dataset.
numerical_cols_public = ["danceability", "instrumentalness", "liveness", "loudness", "duration",
                         "speechiness", "valence", "acousticness", "popularity", "energy", "tempo"]
# Features kept on their original scale (no standardization).
non_standarized_cols = ["num_sections", "sections_avg_duration", "loudness", "duration", "dynamics_changes",
                        "tempo_changes", "time_signature_changes", "popularity", "mode_changes",
                        "key_changes", "tempo"]
categorical_cols = ["key", "mode"]

# Spotify encodes musical key as 0-11 (pitch-class notation) and -1 for "no key".
notes = ("C", "C#", "D", "Eb", "E", "F", "F#", "G", "Ab", "A", "Bb", "B")
key_mapping = dict(enumerate(notes))
key_mapping[-1] = "NoKey"
mode_mapping = {0: "Minor", 1: "Major"}

# Fixed seed so splits / sampling / model fits are reproducible.
random_state = 602452
# Fetch the Chart Studio API key from AWS SSM Parameter Store instead of
# hard-coding the secret in the notebook.
ssm = boto3.client("ssm", region_name="eu-west-1")
api_key_response = ssm.get_parameter(Name="CHART_STUDIO_API_KEY", WithDecryption=True)
chart_studio_api_key = api_key_response["Parameter"]["Value"]
chart_studio.tools.set_credentials_file(username='jcf94', api_key=chart_studio_api_key)

# Load both datasets: the user library from DynamoDB, the public catalogue from CSV.
track_info_raw = read_full_table("track_info")
public_data_raw = pd.read_csv(r"C:\Users\jcf\Desktop\codigo\Portfolio\Spotify Analysis\public_music_data.csv")
# Work on copies so the raw frames stay untouched.
track_info = track_info_raw.copy()
public_data = public_data_raw.copy()

# Replace the numeric key/mode codes with human-readable labels in both datasets.
for frame in (track_info, public_data):
    frame["key"] = frame["key"].map(key_mapping)
    frame["mode"] = frame["mode"].map(mode_mapping)

# Ensure numeric dtypes in the user data (DynamoDB values may arrive as Decimal/str).
for col in numerical_cols:
    track_info[col] = pd.to_numeric(track_info[col])

# Express every "*_changes" counter relative to track duration (changes per second).
for col in track_info.columns:
    if "changes" in col:
        track_info[col] = track_info[col] / track_info["duration"]

track_info["case"] = "user"
# Public durations come in milliseconds; convert to seconds to match user data.
public_data["duration"] = public_data["duration_ms"] / 1000
public_data["case"] = "public"
# User library: per-genre counts of distinct tracks and artists, plus their shares.
# A track can carry several genres, hence the explode before grouping.
genre_info = (
    track_info.explode("genres")
    .loc[:, ["genres", "track_id", "artist"]]
    .groupby(["genres"])
    .nunique()
    .reset_index()
)
genre_info["track_perc"] = 100 * genre_info["track_id"] / genre_info["track_id"].sum()
genre_info["artist_perc"] = 100 * genre_info["artist"] / genre_info["artist"].sum()

# Public dataset: same aggregation, keyed on its track_genre / artists columns.
genre_info_public = (
    public_data.loc[:, ["track_genre", "track_id", "artists"]]
    .groupby(["track_genre"])
    .nunique()
    .reset_index()
)
genre_info_public["track_perc"] = 100 * genre_info_public["track_id"] / genre_info_public["track_id"].sum()
genre_info_public["artist_perc"] = 100 * genre_info_public["artists"] / genre_info_public["artists"].sum()
First, an Exploratory Data Analysis is done to look for possible patterns in the distributions of the different features. Both public and user data are shown in the histograms comparing features.
# Persist the combined user+public feature table for EDA outside this notebook.
df_regression_eda = pd.concat([track_info, public_data])[categorical_cols + numerical_cols + ["case"]]
df_regression_eda.to_csv("df_regression_eda.csv")
# Overlaid percentage histograms of each categorical feature, user vs public.
for col in categorical_cols:
    df = pd.concat([track_info, public_data]).loc[:, ["case", col]]
    df = df.sort_values(by=col)
    fig = go.Figure()
    for case, label in (("user", "User"), ("public", "Public")):
        fig.add_trace(go.Histogram(x=df.loc[df["case"] == case, col],
                                   histnorm="percent", name=label))
    fig.update_layout(barmode="overlay", xaxis_title_text=col, yaxis_title_text='%')
    # Semi-transparent bars so both overlaid distributions stay visible.
    fig.update_traces(opacity=0.75)
    fig.show()
User seems to listen to more songs in A, D and E, whereas the public data has more in Ab, Bb and C. The rest of the keys seem more or less equally distributed.
User also listens to more Minor songs.
The differences aren't specially notable or skewed, so both datasets could be compared in terms of these categorical features.
# Overlaid percentage histograms for every numeric feature present in BOTH datasets.
for col in numerical_cols:
    if col not in track_info.columns or col not in public_data.columns:
        continue
    df = pd.concat([track_info, public_data]).loc[:, ["case", col]]
    fig = go.Figure()
    # Explicit, shared bins (20 across the full range) so the two histograms
    # are directly comparable.
    bin_start = df[col].min()
    bin_end = df[col].max()
    bin_size = abs(bin_end - bin_start) / 20
    shared_bins = dict(start=bin_start, end=bin_end, size=bin_size)
    for case, label in (("user", "User"), ("public", "Public")):
        fig.add_trace(go.Histogram(x=df.loc[df["case"] == case, col],
                                   histnorm="percent", name=label,
                                   autobinx=False, xbins=shared_bins))
    fig.update_layout(barmode="overlay", xaxis_title_text=col, yaxis_title_text='%')
    fig.update_traces(opacity=0.75)
    fig.show()
Some numerical features show similarity between public and user data, and others have notable different distributions:
- Danceability: both datasets have similar distributions but with some translation. Public data is more "danceable" than user data.
- Instrumentalness: public data is way less instrumental than user data, with a more notable peak in the first bin.
- Liveness: similar distribution.
- Loudness: similar distribution, but public data has more left tail (public songs are less loud).
- Duration: public data has considerable less duration than user data, with most of the public tracks averaging at around 4 min.
- Speechiness: similar distributions, but public data has more tracks with low value.
- Valence: distributions are somewhat similar, but have different peaks. Public data is more homogenous, and user data is more dense in low values.
- Acousticness: user data is more skewed, having more density in low values, specially at values near 0. Public data has also a peak here, but not as high, and is more flatly distirbuted after this bin.
- Popularity: distributions are somewhat similar, but some peaks are shifted between distributions.
- Energy: user data has more tracks with high energy, but the trend is similar.
- Tempo: similar distributions, centered around 120-130 BPM.
First, a naive feature selection is made, using an ANOVA test for the numerical features against the target (track genre), and a Chi-square test for the categorical variables (which have been one-hot encoded beforehand).
# Naive univariate feature screening against the target (track_genre):
# ANOVA for numeric features, chi-square for one-hot-encoded categoricals.
df = public_data.copy()
one_hot_encoder = OneHotEncoder()
encoded_features = one_hot_encoder.fit_transform(df[categorical_cols])
encoded_df = pd.DataFrame(encoded_features.todense(),
                          columns=one_hot_encoder.get_feature_names_out(categorical_cols))
df_encoded = pd.concat([df.drop(categorical_cols, axis=1), encoded_df], axis=1)

# Integer-encode the target so grouping is cheap.
label_encoder = LabelEncoder()
df_encoded["track_genre"] = label_encoder.fit_transform(df_encoded['track_genre'])

numerical_features = numerical_cols_public
output = df_encoded['track_genre']

# One-way ANOVA: does the feature's mean differ across genres?
for feature in numerical_features:
    groups = [df_encoded[feature][df_encoded['track_genre'] == category]
              for category in df_encoded['track_genre'].unique()]
    f_val, p_val = f_oneway(*groups)
    print(f"ANOVA for {feature}: F-value = {f_val:.3f}, p-value = {p_val:.3f}")

# Chi-square independence test between each one-hot column and the genre.
categorical_features = encoded_df.columns
for feature in categorical_features:
    contingency_table = pd.crosstab(df_encoded[feature], output)
    chi2, p_value, _, _ = chi2_contingency(contingency_table)
    print(f"Chi-square test between {feature} and output: chi2 = {chi2:.3f}, p-value = {p_value:.3f}")
ANOVA for danceability: F-value = 714.110, p-value = 0.000 ANOVA for instrumentalness: F-value = 832.807, p-value = 0.000 ANOVA for liveness: F-value = 180.186, p-value = 0.000 ANOVA for loudness: F-value = 828.209, p-value = 0.000 ANOVA for duration: F-value = 198.542, p-value = 0.000 ANOVA for speechiness: F-value = 797.292, p-value = 0.000 ANOVA for valence: F-value = 441.163, p-value = 0.000 ANOVA for acousticness: F-value = 960.142, p-value = 0.000 ANOVA for popularity: F-value = 343.462, p-value = 0.000 ANOVA for energy: F-value = 842.844, p-value = 0.000 ANOVA for tempo: F-value = 93.752, p-value = 0.000 Chi-square test between key_A and output: chi2 = 750.010, p-value = 0.000 Chi-square test between key_Ab and output: chi2 = 782.307, p-value = 0.000 Chi-square test between key_B and output: chi2 = 732.271, p-value = 0.000 Chi-square test between key_Bb and output: chi2 = 670.181, p-value = 0.000 Chi-square test between key_C and output: chi2 = 919.286, p-value = 0.000 Chi-square test between key_C# and output: chi2 = 1678.495, p-value = 0.000 Chi-square test between key_D and output: chi2 = 1081.172, p-value = 0.000 Chi-square test between key_E and output: chi2 = 684.697, p-value = 0.000 Chi-square test between key_Eb and output: chi2 = 711.869, p-value = 0.000 Chi-square test between key_F and output: chi2 = 573.680, p-value = 0.000 Chi-square test between key_F# and output: chi2 = 837.132, p-value = 0.000 Chi-square test between key_G and output: chi2 = 639.420, p-value = 0.000 Chi-square test between mode_Major and output: chi2 = 7230.645, p-value = 0.000 Chi-square test between mode_Minor and output: chi2 = 7230.645, p-value = 0.000
According to these results, all features are important in the target if considered one by one. In the following section, correlations are plotted to check if there are correlated features which can be simplified.
# Absolute pairwise correlations between the public numeric features.
correlations_public = public_data[numerical_cols_public].corr().abs()
fig = px.imshow(correlations_public, text_auto=True)
fig.update_layout(autosize=False, width=1100, height=1000)
fig.show()
# Publish to Chart Studio so the plot can be embedded outside the notebook.
py.plot(fig, filename="correlations_classification_raw", auto_open=False)
'https://plotly.com/~jcf94/15/'
From this plot, it can be seen that there are some features which have considerable correlation between them. For each interaction, a new feature is constructed multiplying the features, and then these are discarded.
# First attempt: replace each correlated raw feature with pairwise products.
track_info_regression = public_data.copy()
# Raw features to drop once combined (they correlate strongly with each other).
useless_cols = ["loudness", "energy", "acousticness", "valence", "danceability", "instrumentalness"]
combinations_cols = [("valence", "danceability"), ("energy", "acousticness"), ("energy", "loudness"),
                     ("instrumentalness", "loudness"), ("instrumentalness", "valence")]
numerical_cols_extra = list(set(numerical_cols_public).difference(useless_cols))
for left, right in combinations_cols:
    combo_name = f"{left} - {right}"
    track_info_regression[combo_name] = track_info_regression[left] * track_info_regression[right]
    numerical_cols_extra.append(combo_name)
regression_cols = numerical_cols_extra + categorical_cols

# Re-check correlations after the substitution.
correlations_public = track_info_regression[numerical_cols_extra].corr().abs()
fig = px.imshow(correlations_public, text_auto=True)
fig.update_layout(autosize=False, width=1100, height=800)
fig.show()
Some constructed features show correlation between them (the combination of instrumentalness and loudness, and instrumentalness and valence). A new feature is constructed using the 3 instead of pairwise combination.
# Second attempt: the two instrumentalness pairs still correlated, so merge
# instrumentalness/loudness/valence into a single three-way product.
track_info_regression = public_data.copy()
useless_cols = ["loudness", "energy", "acousticness", "valence", "danceability", "instrumentalness"]
combinations_cols = [("valence", "danceability"), ("energy", "acousticness"), ("energy", "loudness")]
numerical_cols_extra = list(set(numerical_cols_public).difference(useless_cols))
for left, right in combinations_cols:
    combo_name = f"{left} - {right}"
    track_info_regression[combo_name] = track_info_regression[left] * track_info_regression[right]
    numerical_cols_extra.append(combo_name)
track_info_regression["instrumentalness - loudness - valence"] = (
    track_info_regression["instrumentalness"]
    * track_info_regression["loudness"]
    * track_info_regression["valence"]
)
numerical_cols_extra.append("instrumentalness - loudness - valence")
regression_cols = numerical_cols_extra + categorical_cols

# Final correlation check on the reduced feature set.
correlations_public = track_info_regression[numerical_cols_extra].corr().abs()
fig = px.imshow(correlations_public, text_auto=True)
fig.update_layout(autosize=False, width=1100, height=800)
fig.show()
py.plot(fig, filename="correlations_classification_final", auto_open=False)
'https://plotly.com/~jcf94/17/'
The final correlation matrix shows that the features selected don't have significant correlation between them, so they can be good candidates for the classification model.
In the next section, instead of trying a naïve selection, a FeatureSelection model from sklearn is used, to check whether the results are different. This approach should be more powerful at selecting features: instead of just looking at correlations, a simple model is fitted and features are selected by checking whether the model improves or degrades. A simple RidgeClassifier is used, as this model penalizes features which are not important for prediction.
# Model-driven feature selection: greedy forward selection scored with a
# cross-validated Ridge classifier, starting from ALL raw features plus the
# engineered combinations (no raw feature dropped up-front this time).
track_info_regression = public_data.copy()
combinations_cols = [("valence", "danceability"), ("energy", "acousticness"), ("energy", "loudness")]
numerical_cols_fs = list(numerical_cols_public)
for left, right in combinations_cols:
    combo_name = f"{left} - {right}"
    track_info_regression[combo_name] = track_info_regression[left] * track_info_regression[right]
    numerical_cols_fs.append(combo_name)
track_info_regression["instrumentalness - loudness - valence"] = (
    track_info_regression["instrumentalness"]
    * track_info_regression["loudness"]
    * track_info_regression["valence"]
)
numerical_cols_fs.append("instrumentalness - loudness - valence")
track_info_regression = track_info_regression[numerical_cols_fs + categorical_cols + ["track_genre"]]

# One-hot encode key/mode for the linear model.
one_hot_encoder = OneHotEncoder()
encoded_features = one_hot_encoder.fit_transform(track_info_regression[categorical_cols])
categorical_cols_encoded = one_hot_encoder.get_feature_names_out(categorical_cols)
regression_cols = numerical_cols_fs + categorical_cols_encoded.tolist()
encoded_df = pd.DataFrame(encoded_features.todense(), columns=categorical_cols_encoded)
df_encoded = pd.concat([track_info_regression.drop(categorical_cols, axis=1), encoded_df], axis=1)

X = df_encoded[regression_cols]
Y = df_encoded["track_genre"]
# Ridge penalizes uninformative features; selection stops once the score
# gain drops below tol.
ridge = RidgeClassifierCV(alphas=np.logspace(-6, 6, num=5))
sfs = SequentialFeatureSelector(ridge, tol=0.001).fit(X, Y)
list(zip(sfs.get_support(), X.columns))
[(True, 'danceability'), (True, 'instrumentalness'), (False, 'liveness'), (True, 'loudness'), (True, 'duration'), (True, 'speechiness'), (True, 'valence'), (True, 'acousticness'), (True, 'popularity'), (True, 'energy'), (True, 'tempo'), (False, 'valence - danceability'), (False, 'energy - acousticness'), (True, 'energy - loudness'), (False, 'instrumentalness - loudness - valence'), (False, 'key_A'), (False, 'key_Ab'), (False, 'key_B'), (False, 'key_Bb'), (False, 'key_C'), (False, 'key_C#'), (False, 'key_D'), (False, 'key_E'), (False, 'key_Eb'), (False, 'key_F'), (False, 'key_F#'), (False, 'key_G'), (True, 'mode_Major'), (False, 'mode_Minor')]
It shows that some of the features using simpler tests are now considered not important in prediction, such as liveness, key, and some of the combined features. This could be the fact that these features and the target are correlated, but one does not help in predicting the other (both variables could be correlated to another, possibly unknown, feature, which could help in prediction if available).
# Keep only the features the sequential selector retained.
# (Comprehension replaces the manual append loop; the commented-out debug
# print was dead code and has been removed.)
valid_features = [feature for keep, feature in zip(sfs.get_support(), X.columns) if keep]
With the features selected, it's time to select the best model to do classification. In a first phase, the pycaret library is used to compare several models automatically. This library only needs the data, and internally it does preprocessing, splits data in train, test and validation, fitting several models using cross-validation, and calculating several metrics, such as accuracy and time spent in fitting.
One inconvenience of this library is that it needs a considerable amount of computing power and memory. In a first step, only a subset of the full data is used to speed up the overall process. The data is selected homogeneously, with 1/4 of the total amount of tracks for each genre (this way, 1/4 of the data is selected randomly with an equal representation of each genre). Even with reduced data, the process takes almost 30 min to complete.
# Balanced subsample: 250 tracks per genre keeps every class equally
# represented while cutting pycaret's runtime.
data = track_info_regression.groupby("track_genre").sample(n=250, random_state=random_state)
data['track_genre'] = data['track_genre'].astype('category')

# pycaret handles categorical columns itself, so feed it the raw "mode"/"key"
# instead of the one-hot columns chosen by the selector.
valid_features_pycaret = [f for f in valid_features if "key" not in f and "mode" not in f] + ["mode", "key"]
data = data[valid_features_pycaret + ["track_genre"]]

# Time the full pycaret comparison across all its model families.
start_time = time.time()
clf1 = pc.setup(data, target='track_genre', verbose=False, session_id=random_state)
best_model = pc.compare_models()
print(best_model)
end_time = time.time()
print(f"execution time: {end_time - start_time}")
| Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | TT (Sec) | |
|---|---|---|---|---|---|---|---|---|---|
| rf | Random Forest Classifier | 0.2982 | 0.8816 | 0.2982 | 0.2847 | 0.2830 | 0.2920 | 0.2922 | 7.7230 |
| xgboost | Extreme Gradient Boosting | 0.2943 | 0.9175 | 0.2943 | 0.2952 | 0.2905 | 0.2881 | 0.2882 | 9.4910 |
| gbc | Gradient Boosting Classifier | 0.2655 | 0.0000 | 0.2655 | 0.2644 | 0.2597 | 0.2590 | 0.2591 | 97.4090 |
| et | Extra Trees Classifier | 0.2655 | 0.8394 | 0.2655 | 0.2493 | 0.2503 | 0.2590 | 0.2592 | 13.3630 |
| lightgbm | Light Gradient Boosting Machine | 0.2546 | 0.8568 | 0.2546 | 0.2666 | 0.2561 | 0.2480 | 0.2482 | 15.5430 |
| dt | Decision Tree Classifier | 0.1916 | 0.5996 | 0.1916 | 0.1952 | 0.1906 | 0.1844 | 0.1845 | 0.2100 |
| lda | Linear Discriminant Analysis | 0.1674 | 0.0000 | 0.1674 | 0.1440 | 0.1426 | 0.1601 | 0.1604 | 0.0710 |
| lr | Logistic Regression | 0.1537 | 0.0000 | 0.1537 | 0.1213 | 0.1236 | 0.1462 | 0.1466 | 11.8690 |
| nb | Naive Bayes | 0.1374 | 0.8297 | 0.1374 | 0.1539 | 0.1185 | 0.1298 | 0.1322 | 0.1280 |
| ridge | Ridge Classifier | 0.1228 | 0.0000 | 0.1228 | 0.0783 | 0.0730 | 0.1150 | 0.1166 | 0.0590 |
| knn | K Neighbors Classifier | 0.1078 | 0.6303 | 0.1078 | 0.1237 | 0.1043 | 0.0999 | 0.1002 | 0.5120 |
| qda | Quadratic Discriminant Analysis | 0.0715 | 0.0000 | 0.0715 | 0.1199 | 0.0718 | 0.0633 | 0.0703 | 0.1210 |
| ada | Ada Boost Classifier | 0.0690 | 0.0000 | 0.0690 | 0.0392 | 0.0326 | 0.0608 | 0.0644 | 1.4120 |
| svm | SVM - Linear Kernel | 0.0265 | 0.0000 | 0.0265 | 0.0186 | 0.0115 | 0.0179 | 0.0219 | 2.7820 |
| dummy | Dummy Classifier | 0.0085 | 0.5000 | 0.0085 | 0.0001 | 0.0001 | 0.0000 | 0.0000 | 0.0580 |
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='sqrt',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_samples_leaf=1,
min_samples_split=2, min_weight_fraction_leaf=0.0,
monotonic_cst=None, n_estimators=100, n_jobs=-1,
oob_score=False, random_state=602452, verbose=0,
warm_start=False)
execution time: 1620.0661885738373
The best models with respect accuracy are Random Forest Classifier, XGBoost and Gradient Boosting Classifier. These 3 models are compared using the full dataset.
# Re-run the comparison on the FULL dataset, restricted to the top-3 model
# families from the subsample run; cross-validation disabled to keep runtime down.
data = track_info_regression.copy()
data['track_genre'] = data['track_genre'].astype('category')
valid_features_pycaret = [f for f in valid_features if "key" not in f and "mode" not in f] + ["mode", "key"]
data = data[valid_features_pycaret + ["track_genre"]]

start_time = time.time()
clf1 = pc.setup(data, target='track_genre', verbose=False, session_id=random_state)
best_models = pc.compare_models(n_select=3, cross_validation=False, include=["et", 'rf', 'xgboost'])
print(best_models)
end_time = time.time()
print(f"execution time: {end_time - start_time}")
| Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | TT (Sec) | |
|---|---|---|---|---|---|---|---|---|---|
| rf | Random Forest Classifier | 0.3191 | 0.8874 | 0.3191 | 0.3081 | 0.3095 | 0.3131 | 0.3131 | 14.1800 |
| xgboost | Extreme Gradient Boosting | 0.3075 | 0.9435 | 0.3075 | 0.3087 | 0.3069 | 0.3014 | 0.3014 | 52.8900 |
| et | Extra Trees Classifier | 0.2993 | 0.8054 | 0.2993 | 0.2845 | 0.2873 | 0.2931 | 0.2932 | 12.1100 |
[RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='sqrt',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_samples_leaf=1,
min_samples_split=2, min_weight_fraction_leaf=0.0,
monotonic_cst=None, n_estimators=100, n_jobs=-1,
oob_score=False, random_state=602452, verbose=0,
warm_start=False), XGBClassifier(base_score=None, booster='gbtree', callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device='cpu', early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None, n_jobs=-1,
num_parallel_tree=None, objective='multi:softprob', ...), ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='sqrt',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_samples_leaf=1,
min_samples_split=2, min_weight_fraction_leaf=0.0,
monotonic_cst=None, n_estimators=100, n_jobs=-1,
oob_score=False, random_state=602452, verbose=0,
warm_start=False)]
execution time: 233.26131105422974
The Random Forest Classifier is the best model overall, with 31.9% accuracy. This value is not very high, but taking into account the total number of classes (114), it is not an easy problem. The execution time is almost 4 min, much less than when using all the models.
The following plot shows the SHAP values for each feature. The plot shows similarities with the feature selection process: we can see that the key has a very low importance for predicting. If we included all the features, we could see that the features removed would have had a low importance also.
# Feature-importance (SHAP-style) plot for the best model (the Random Forest).
pc.plot_model(best_models[0], plot = 'feature_all')
Finally, the error is calculated for each class (genre), to see which performed better and worse. Pycaret includes methods to calculate predictions with the test dataset.
# Per-genre error rate on pycaret's held-out test split.
preds = pc.predict_model(best_models[0])
preds["success"] = preds["track_genre"].eq(preds["prediction_label"])
success = (
    preds.loc[:, ["track_genre", "success"]]
    .groupby(["track_genre"])
    .agg(total=("track_genre", "count"), success=("success", "sum"))
    .reset_index()
)
success["error"] = 100 * (1 - success["success"] / success["total"])
# Overall error across all genres.
total_error = 100 * (1 - preds["success"].sum() / preds["success"].count())
print(f"{total_error=}%")
success.sort_values(by="error")
| Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
|---|---|---|---|---|---|---|---|---|
| 0 | Random Forest Classifier | 0.3191 | 0.8874 | 0.3191 | 0.3081 | 0.3095 | 0.3131 | 0.3131 |
total_error=68.0906432748538%
| track_genre | total | success | error | |
|---|---|---|---|---|
| 42 | grindcore | 300 | 271 | 9.666667 |
| 93 | romance | 300 | 255 | 15.000000 |
| 18 | comedy | 300 | 255 | 15.000000 |
| 105 | study | 300 | 255 | 15.000000 |
| 101 | sleep | 300 | 248 | 17.333333 |
| ... | ... | ... | ... | ... |
| 9 | brazil | 300 | 15 | 95.000000 |
| 2 | alt-rock | 300 | 10 | 96.666667 |
| 32 | electronic | 300 | 9 | 97.000000 |
| 99 | singer-songwriter | 300 | 8 | 97.333333 |
| 56 | indie | 300 | 7 | 97.666667 |
114 rows × 4 columns
Grindcore, romance, comedy, study and sleep are the best predicted genres, with an error <= 15%. On the contrary, indie, singer-songwriter, electronic, alt-rock and brazil are the worst predicted genres, with more than 95% of error.
The confusion matrix is calculated, but not shown, as it's not an easily readable plot, with over a hundred different classes. The result is stored in csv format to see with detail outside this notebook.
# Confusion matrix is too large to plot (114 classes); export it for offline review.
genre_labels = preds["track_genre"].unique()
confusion = pd.DataFrame(
    metrics.confusion_matrix(preds["track_genre"], preds["prediction_label"], labels=genre_labels),
    index=genre_labels,
    columns=genre_labels,
)
confusion.to_csv("confusion_pycaret.csv")
In the second phase, we train the same Random Forest Classifier as pycaret, but using the model from sklearn directly, to compare execution times. As we can access the model inside the pycaret setup and objects, we can just copy the hyperparameters and all initialization variables to construct the model, skipping the tuning phase.
# Rebuild the design matrix with pandas dummies so the one-hot column names
# match the "key_*"/"mode_*" names selected earlier in valid_features.
X = track_info_regression.copy()
X = pd.get_dummies(X, columns=categorical_cols)
X = X[valid_features]
Y = track_info_regression["track_genre"]
# NOTE(review): Y_encoded is computed but never used below — the forest is fit
# on the raw string labels. Looks like a leftover; confirm before removing.
label_encoder = OneHotEncoder()
Y_encoded = label_encoder.fit_transform(pd.DataFrame(Y)).todense()
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=random_state)
# Time a direct sklearn fit with the exact hyperparameters pycaret reported
# for its best Random Forest, to compare wall-clock cost.
start_time = time.time()
model_rf = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='sqrt',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_samples_leaf=1,
min_samples_split=2, min_weight_fraction_leaf=0.0,
monotonic_cst=None, n_estimators=100, n_jobs=-1,
oob_score=False, random_state=random_state, verbose=0,
warm_start=False)
model_rf.fit(X_train, y_train)
end_time = time.time()
print(f"execution time: {end_time - start_time}")
execution time: 13.301671028137207
The fitting takes only 13 seconds, much faster than using pycaret, but we are constructing the model directly using information from the previous process, so it can't be really compared. Nevertheless, having total control of the model implementation can be a nice to have feature. In experience, pycaret not only uses large amounts of computing and memory power, but the models can be somewhat problematic when saving and trying to persist them isolated. The setup function must be executed, and saved if desired, before the models can be constructed and accessed. Some problems where encountered when saving the pycaret models but not the setup. In this case, the whole experiment could take several GB of memory when saving in disk.
As before, the errors are calculated for each class. We can see that the results are somewhat different, but only slightly, and this could be because the train/test split is not the same as in the pycaret experiment. The results are plotted for each genre, sorted by error, in order to see if there is a trend or some visible anomaly.
# Per-genre error for the directly-fitted Random Forest, on our own test split.
y_pred = model_rf.predict(X_test)
preds_rf = pd.concat([X_test, y_test], axis=1)
preds_rf["prediction_label"] = y_pred
preds_rf["success"] = preds_rf["track_genre"].eq(preds_rf["prediction_label"])
success_rf = (
    preds_rf.loc[:, ["track_genre", "success"]]
    .groupby(["track_genre"])
    .agg(total=("track_genre", "count"), success=("success", "sum"))
    .reset_index()
)
success_rf["error"] = 100 * (1 - success_rf["success"] / success_rf["total"])
total_error = 100 * (1 - success_rf["success"].sum() / success_rf["total"].sum())
print(f"{total_error=}%")
display(success_rf.sort_values(by="error"))
# Bar chart of genres sorted by error to spot trends/anomalies.
fig = px.bar(success_rf.sort_values(by="error"), x="track_genre", y="error")
fig.show()
total_error=67.51754385964912%
| track_genre | total | success | error | |
|---|---|---|---|---|
| 42 | grindcore | 226 | 203 | 10.176991 |
| 101 | sleep | 199 | 178 | 10.552764 |
| 93 | romance | 198 | 171 | 13.636364 |
| 105 | study | 208 | 178 | 14.423077 |
| 18 | comedy | 208 | 178 | 14.423077 |
| ... | ... | ... | ... | ... |
| 2 | alt-rock | 200 | 8 | 96.000000 |
| 32 | electronic | 182 | 7 | 96.153846 |
| 89 | reggaeton | 199 | 7 | 96.482412 |
| 85 | punk | 212 | 6 | 97.169811 |
| 102 | songwriter | 215 | 4 | 98.139535 |
114 rows × 4 columns
The first few genres with low error seem to stay low, and then jumps from 14% to almost 20%. Starting from 30%, the trends stays more or less linear.
In the following section, a neural network model is constructed, as seen in the following article:
The problem posed in the article is the same as here (multiclass classification for spotify data), so it's good to see if the results shown are similar to the one in the previous section. We can compare a different type of model and see if the performance gets better, worse or stays the same.
The same NN architecture will be used.
For this model, some preprocessing needs to be done. In particular, enconding and scaling are done using scikit pipelines.
# Preprocessing for the neural network: ordinal-encode binary "mode",
# one-hot-encode "key", standardize the numeric columns, and integer-encode
# the target.
data = track_info_regression[valid_features_pycaret + ["track_genre"]]
X = data[valid_features_pycaret]
y = data["track_genre"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

ordinal_encoder = OrdinalEncoder(handle_unknown='error')
one_hot_encoder = OneHotEncoder(handle_unknown='error', sparse_output=False)
scaler = StandardScaler()
scale_columns = X.columns.difference(["mode", "key"])
preprocessor = ColumnTransformer(transformers = [('ordinal', ordinal_encoder, ["mode"]),
                                                 ('onehot', one_hot_encoder, ["key"]),
                                                 ("scaler", scaler, scale_columns)],
                                 remainder="passthrough")

# BUGFIX: encode_target previously called fit_transform, so transforming the
# TEST labels silently re-fitted the encoder on them. Fit once on the training
# labels and only transform thereafter, so train/test share one fixed mapping.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)

def encode_target(y):
    """Map genre labels to the integer codes learned from the training set."""
    return label_encoder.transform(y)

def decode_target(y):
    """Inverse of encode_target: integer codes back to genre labels."""
    return label_encoder.inverse_transform(y)

target_encoder = FunctionTransformer(encode_target, validate=False)

# Fit the feature pipeline on train only; apply the fitted transform to test.
pipe = Pipeline(steps=[('preprocessor', preprocessor)])
X_train_scaled = pipe.fit_transform(X_train)
X_test_scaled = pipe.transform(X_test)

# Recover the post-transform column order: ordinal "mode", one-hot keys,
# then the scaled numeric columns.
one_hot_cols = pipe.named_steps['preprocessor'].named_transformers_['onehot'].get_feature_names_out(["key"])
pipeline_cols = np.concatenate((["mode"], one_hot_cols, scale_columns))
valid_features_nn = [e for e in valid_features if e != "mode_Major"]
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=pipeline_cols)
X_train_scaled_df = X_train_scaled_df[valid_features_nn + ["mode"]]
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=pipeline_cols)
X_test_scaled_df = X_test_scaled_df[valid_features_nn + ["mode"]]

y_train_enc = target_encoder.fit_transform(y_train)
y_test_enc = target_encoder.transform(y_test)
Once preprocessing is done (as well as train and test samples prepared), the NN is constructed using Keras.
# Two early-stopping callbacks: whichever of val_loss or val_accuracy stops
# improving for 10 epochs halts training, restoring the best weights seen.
early_stopping1 = keras.callbacks.EarlyStopping(monitor = "val_loss",
                                                patience = 10, restore_best_weights = True)
early_stopping2 = keras.callbacks.EarlyStopping(monitor = "val_accuracy",
                                                patience = 10, restore_best_weights = True)
# Same architecture as the reference article: dense stages with batch-norm
# and dropout regularization, ending in a softmax over all genre classes.
n_inputs = X_train_scaled_df.shape[1]
n_classes = max(y_train_enc) + 1
nn_model = keras.Sequential()
nn_model.add(keras.layers.Input(name = "input", shape = (n_inputs, )))
nn_model.add(keras.layers.Dense(256, activation = "relu"))
nn_model.add(keras.layers.BatchNormalization())
nn_model.add(keras.layers.Dropout(0.2))
nn_model.add(keras.layers.Dense(128, activation = "relu"))
nn_model.add(keras.layers.Dense(128, activation = "relu"))
nn_model.add(keras.layers.BatchNormalization())
nn_model.add(keras.layers.Dropout(0.2))
nn_model.add(keras.layers.Dense(64, activation = "relu"))
nn_model.add(keras.layers.Dense(n_classes, activation = "softmax"))
nn_model.summary()
Model: "sequential"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓ ┃ Layer (type) ┃ Output Shape ┃ Param # ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩ │ dense (Dense) │ (None, 256) │ 3,328 │ ├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤ │ batch_normalization │ (None, 256) │ 1,024 │ │ (BatchNormalization) │ │ │ ├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤ │ dropout (Dropout) │ (None, 256) │ 0 │ ├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤ │ dense_1 (Dense) │ (None, 128) │ 32,896 │ ├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤ │ dense_2 (Dense) │ (None, 128) │ 16,512 │ ├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤ │ batch_normalization_1 │ (None, 128) │ 512 │ │ (BatchNormalization) │ │ │ ├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤ │ dropout_1 (Dropout) │ (None, 128) │ 0 │ ├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤ │ dense_3 (Dense) │ (None, 64) │ 8,256 │ ├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤ │ dense_4 (Dense) │ (None, 114) │ 7,410 │ └──────────────────────────────────────┴─────────────────────────────┴─────────────────┘
Total params: 69,938 (273.20 KB)
Trainable params: 69,170 (270.20 KB)
Non-trainable params: 768 (3.00 KB)
# Compile and train the network, timing the whole run (compile + fit) so it
# can be compared with the Random Forest training time.
start_time = time.time()
nn_model.compile(optimizer = keras.optimizers.Adam(),
                 loss = "sparse_categorical_crossentropy",
                 metrics = ["accuracy"])
fit_options = dict(epochs = 100,
                   verbose = 1,
                   batch_size = 128,
                   validation_data = (X_test_scaled_df.values, y_test_enc),
                   callbacks = [early_stopping1, early_stopping2])
model_history = nn_model.fit(X_train_scaled_df.values, y_train_enc, **fit_options)
end_time = time.time()
print(f"execution time: {end_time - start_time}")
Epoch 1/100 713/713 ━━━━━━━━━━━━━━━━━━━━ 6s 5ms/step - accuracy: 0.1240 - loss: 3.9075 - val_accuracy: 0.2407 - val_loss: 3.0343 Epoch 2/100 713/713 ━━━━━━━━━━━━━━━━━━━━ 4s 5ms/step - accuracy: 0.2237 - loss: 3.0947 - val_accuracy: 0.2671 - val_loss: 2.8688 Epoch 3/100 713/713 ━━━━━━━━━━━━━━━━━━━━ 3s 5ms/step - accuracy: 0.2452 - loss: 2.9513 - val_accuracy: 0.2745 - val_loss: 2.7969 Epoch 4/100 713/713 ━━━━━━━━━━━━━━━━━━━━ 3s 4ms/step - accuracy: 0.2595 - loss: 2.8691 - val_accuracy: 0.2870 - val_loss: 2.7379 Epoch 5/100 713/713 ━━━━━━━━━━━━━━━━━━━━ 3s 5ms/step - accuracy: 0.2699 - loss: 2.8212 - val_accuracy: 0.2894 - val_loss: 2.7054 Epoch 6/100 713/713 ━━━━━━━━━━━━━━━━━━━━ 3s 4ms/step - accuracy: 0.2748 - loss: 2.7797 - val_accuracy: 0.3000 - val_loss: 2.6663 Epoch 7/100 713/713 ━━━━━━━━━━━━━━━━━━━━ 3s 4ms/step - accuracy: 0.2831 - loss: 2.7508 - val_accuracy: 0.3008 - val_loss: 2.6410 Epoch 8/100 713/713 ━━━━━━━━━━━━━━━━━━━━ 3s 4ms/step - accuracy: 0.2839 - loss: 2.7223 - val_accuracy: 0.2991 - val_loss: 2.6343 Epoch 9/100 713/713 ━━━━━━━━━━━━━━━━━━━━ 3s 5ms/step - accuracy: 0.2852 - loss: 2.7082 - val_accuracy: 0.3061 - val_loss: 2.6118 Epoch 10/100 713/713 ━━━━━━━━━━━━━━━━━━━━ 3s 4ms/step - accuracy: 0.2899 - loss: 2.6906 - val_accuracy: 0.3049 - val_loss: 2.6136 Epoch 11/100 713/713 ━━━━━━━━━━━━━━━━━━━━ 3s 4ms/step - accuracy: 0.2902 - loss: 2.6818 - val_accuracy: 0.3038 - val_loss: 2.6069 Epoch 12/100 713/713 ━━━━━━━━━━━━━━━━━━━━ 3s 4ms/step - accuracy: 0.2955 - loss: 2.6630 - val_accuracy: 0.3107 - val_loss: 2.5857 Epoch 13/100 713/713 ━━━━━━━━━━━━━━━━━━━━ 3s 4ms/step - accuracy: 0.2987 - loss: 2.6373 - val_accuracy: 0.3135 - val_loss: 2.5765 Epoch 14/100 713/713 ━━━━━━━━━━━━━━━━━━━━ 3s 4ms/step - accuracy: 0.2975 - loss: 2.6414 - val_accuracy: 0.3136 - val_loss: 2.5753 Epoch 15/100 713/713 ━━━━━━━━━━━━━━━━━━━━ 3s 4ms/step - accuracy: 0.3037 - loss: 2.6204 - val_accuracy: 0.3155 - val_loss: 2.5656 Epoch 16/100 713/713 ━━━━━━━━━━━━━━━━━━━━ 3s 4ms/step - 
accuracy: 0.2997 - loss: 2.6288 - val_accuracy: 0.3162 - val_loss: 2.5557 Epoch 17/100 713/713 ━━━━━━━━━━━━━━━━━━━━ 3s 5ms/step - accuracy: 0.3027 - loss: 2.6121 - val_accuracy: 0.3154 - val_loss: 2.5511 Epoch 18/100 713/713 ━━━━━━━━━━━━━━━━━━━━ 3s 4ms/step - accuracy: 0.3089 - loss: 2.5963 - val_accuracy: 0.3175 - val_loss: 2.5509 Epoch 19/100 713/713 ━━━━━━━━━━━━━━━━━━━━ 3s 4ms/step - accuracy: 0.3068 - loss: 2.5935 - val_accuracy: 0.3209 - val_loss: 2.5436 Epoch 20/100 713/713 ━━━━━━━━━━━━━━━━━━━━ 3s 4ms/step - accuracy: 0.3084 - loss: 2.5860 - val_accuracy: 0.3228 - val_loss: 2.5400 Epoch 21/100 713/713 ━━━━━━━━━━━━━━━━━━━━ 3s 4ms/step - accuracy: 0.3101 - loss: 2.5920 - val_accuracy: 0.3230 - val_loss: 2.5297 Epoch 22/100 713/713 ━━━━━━━━━━━━━━━━━━━━ 3s 4ms/step - accuracy: 0.3098 - loss: 2.5806 - val_accuracy: 0.3229 - val_loss: 2.5346 Epoch 23/100 713/713 ━━━━━━━━━━━━━━━━━━━━ 4s 5ms/step - accuracy: 0.3106 - loss: 2.5772 - val_accuracy: 0.3191 - val_loss: 2.5318 Epoch 24/100 713/713 ━━━━━━━━━━━━━━━━━━━━ 3s 5ms/step - accuracy: 0.3094 - loss: 2.5697 - val_accuracy: 0.3231 - val_loss: 2.5248 Epoch 25/100 713/713 ━━━━━━━━━━━━━━━━━━━━ 3s 5ms/step - accuracy: 0.3135 - loss: 2.5510 - val_accuracy: 0.3254 - val_loss: 2.5214 Epoch 26/100 713/713 ━━━━━━━━━━━━━━━━━━━━ 3s 5ms/step - accuracy: 0.3160 - loss: 2.5508 - val_accuracy: 0.3225 - val_loss: 2.5194 Epoch 27/100 713/713 ━━━━━━━━━━━━━━━━━━━━ 3s 4ms/step - accuracy: 0.3129 - loss: 2.5529 - val_accuracy: 0.3274 - val_loss: 2.5032 Epoch 28/100 713/713 ━━━━━━━━━━━━━━━━━━━━ 3s 5ms/step - accuracy: 0.3183 - loss: 2.5467 - val_accuracy: 0.3270 - val_loss: 2.5053 Epoch 29/100 713/713 ━━━━━━━━━━━━━━━━━━━━ 3s 5ms/step - accuracy: 0.3161 - loss: 2.5386 - val_accuracy: 0.3249 - val_loss: 2.5038 Epoch 30/100 713/713 ━━━━━━━━━━━━━━━━━━━━ 3s 5ms/step - accuracy: 0.3171 - loss: 2.5317 - val_accuracy: 0.3244 - val_loss: 2.5060 Epoch 31/100 713/713 ━━━━━━━━━━━━━━━━━━━━ 5s 4ms/step - accuracy: 0.3191 - loss: 2.5298 - val_accuracy: 
0.3274 - val_loss: 2.4999 Epoch 32/100 713/713 ━━━━━━━━━━━━━━━━━━━━ 3s 5ms/step - accuracy: 0.3203 - loss: 2.5254 - val_accuracy: 0.3310 - val_loss: 2.4944 Epoch 33/100 713/713 ━━━━━━━━━━━━━━━━━━━━ 3s 5ms/step - accuracy: 0.3190 - loss: 2.5244 - val_accuracy: 0.3287 - val_loss: 2.4970 Epoch 34/100 713/713 ━━━━━━━━━━━━━━━━━━━━ 3s 4ms/step - accuracy: 0.3213 - loss: 2.5177 - val_accuracy: 0.3283 - val_loss: 2.4889 Epoch 35/100 713/713 ━━━━━━━━━━━━━━━━━━━━ 3s 5ms/step - accuracy: 0.3204 - loss: 2.5144 - val_accuracy: 0.3246 - val_loss: 2.4904 Epoch 36/100 713/713 ━━━━━━━━━━━━━━━━━━━━ 3s 5ms/step - accuracy: 0.3207 - loss: 2.5128 - val_accuracy: 0.3289 - val_loss: 2.4892 Epoch 37/100 713/713 ━━━━━━━━━━━━━━━━━━━━ 3s 5ms/step - accuracy: 0.3228 - loss: 2.5095 - val_accuracy: 0.3295 - val_loss: 2.4846 Epoch 38/100 713/713 ━━━━━━━━━━━━━━━━━━━━ 3s 5ms/step - accuracy: 0.3244 - loss: 2.4991 - val_accuracy: 0.3289 - val_loss: 2.4899 Epoch 39/100 713/713 ━━━━━━━━━━━━━━━━━━━━ 3s 4ms/step - accuracy: 0.3223 - loss: 2.5060 - val_accuracy: 0.3307 - val_loss: 2.4849 Epoch 40/100 713/713 ━━━━━━━━━━━━━━━━━━━━ 3s 5ms/step - accuracy: 0.3214 - loss: 2.5037 - val_accuracy: 0.3286 - val_loss: 2.4834 Epoch 41/100 713/713 ━━━━━━━━━━━━━━━━━━━━ 3s 5ms/step - accuracy: 0.3246 - loss: 2.4966 - val_accuracy: 0.3306 - val_loss: 2.4853 Epoch 42/100 713/713 ━━━━━━━━━━━━━━━━━━━━ 3s 5ms/step - accuracy: 0.3248 - loss: 2.4964 - val_accuracy: 0.3297 - val_loss: 2.4780 execution time: 142.8267743587494
# Report final [loss, accuracy] on both splits: train first, then test.
for features, labels in ((X_train_scaled_df.values, y_train_enc),
                         (X_test_scaled_df.values, y_test_enc)):
    print(nn_model.evaluate(features, labels))
2850/2850 ━━━━━━━━━━━━━━━━━━━━ 4s 1ms/step - accuracy: 0.3538 - loss: 2.3452 [2.3471789360046387, 0.35471490025520325] 713/713 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - accuracy: 0.3329 - loss: 2.4979 [2.494364023208618, 0.33096492290496826]
With the early stopping callbacks, training takes around 142 seconds, around 10 times longer than the Random Forest model, but still a reasonable amount of time.
The errors are compared by class, and both NN and RF models are plotted to see a direct comparison where one performs better or worse.
# Predicted class = argmax over the softmax probabilities; then map the
# integer class ids back to genre names via the fitted label encoder.
class_probabilities = nn_model.predict(X_test_scaled_df.values)
y_pred_enc = class_probabilities.argmax(axis=1)
y_pred = decode_target(y_pred_enc)
713/713 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step
# Build a per-genre success/error table for the NN and merge in the RF
# results, so both models can be compared class by class.
preds_nn = pd.concat([X_test_scaled_df, y_test.reset_index()], axis=1)
preds_nn["prediction_label"] = y_pred
preds_nn["success"] = preds_nn["track_genre"].eq(preds_nn["prediction_label"])
success_nn = (preds_nn.loc[:, ["track_genre", "success"]]
              .groupby(["track_genre"])
              .agg(total=("track_genre", "count"),
                   success=("success", "sum"))
              .reset_index())
success_nn["error"] = 100 * (1 - success_nn["success"] / success_nn["total"])
success_nn = (success_nn
              .merge(success_rf, on=["track_genre"], suffixes=(None, "_rf"))
              .loc[:, ["track_genre", "total", "success", "success_rf", "error", "error_rf"]]
              .sort_values(by="error"))
total_error = 100 * (1 - success_nn["success"].sum() / success_nn["total"].sum())
print(f"{total_error=}%")
display(success_nn.sort_values(by="error"))
# Side-by-side bars per genre: NN error vs RF error; also upload the chart.
fig = go.Figure()
for trace_name, error_column in (("NN", "error"), ("RF", "error_rf")):
    fig.add_trace(go.Bar(x=success_nn["track_genre"],
                         y=success_nn[error_column],
                         name=trace_name))
fig.show()
py.plot(fig, filename="models_comparison_errors", auto_open=False)
total_error=66.90350877192984%
| track_genre | total | success | success_rf | error | error_rf | |
|---|---|---|---|---|---|---|
| 42 | grindcore | 226 | 197 | 203 | 12.831858 | 10.176991 |
| 18 | comedy | 208 | 175 | 178 | 15.865385 | 14.423077 |
| 101 | sleep | 199 | 167 | 178 | 16.080402 | 10.552764 |
| 52 | honky-tonk | 227 | 185 | 181 | 18.502203 | 20.264317 |
| 105 | study | 208 | 165 | 178 | 20.673077 | 14.423077 |
| ... | ... | ... | ... | ... | ... | ... |
| 106 | swedish | 206 | 8 | 33 | 96.116505 | 83.980583 |
| 11 | british | 176 | 3 | 16 | 98.295455 | 90.909091 |
| 56 | indie | 203 | 3 | 9 | 98.522167 | 95.566502 |
| 3 | alternative | 210 | 3 | 21 | 98.571429 | 90.000000 |
| 89 | reggaeton | 199 | 0 | 7 | 100.000000 | 96.482412 |
114 rows × 6 columns
'https://plotly.com/~jcf94/19/'
The error is a little lower than the RF model, but not by very much (the error is reduced by around 1%). The error is higher than in the article mentioned, but the datasets used are fairly different (the one used in this analysis has around 1M rows, with 114 genres equally distributed, whereas the one used in the article has 42k rows and 15 genres, and some are later grouped, so the total number of classes is even lower).
Most genres have a similar error, but there are some where RF performs notably worse (for example, for "latino" genre, RF error is more than 90%, whereas the NN model has around 60% error for this class. Rock, metal and hiphop have also around 20% more error in RF model).
It's interesting to note that the same model architecture as the article, without further tuning to this particular situation, performs at least as well as the RF model (only marginally better). Tuning the NN model for this particular dataset distribution could improve the performance, but this shows the importance of reviewing the existing literature on a problem, as maybe someone has already worked on it (or a variation), and this knowledge can help the analysis or even answer the questions asked (in this case, maybe we could have started with the NN model directly and spent more time tuning it to see if performance improves).
In conclusion, both NN and RF models are similar in performance. They have a similar total error, and a similar distribution across genres, but NN performs better in some particular classes. RF model would be the final choice between the two, as it takes less time to train and does not need as much preprocessing. For further analysis, the NN model could be fine-tuned to see if it improves, or a mixed model could be made using both models.
# Confusion matrix for the NN model: rows = actual genre, columns = predicted.
# Two fixes over the previous version: (1) sklearn's signature is
# confusion_matrix(y_true, y_pred) — the arguments were swapped, transposing
# the matrix; (2) the DataFrame axes were labeled with pd.unique(y_pred)
# (appearance order of the predictions), while confusion_matrix orders its
# rows/columns by the sorted label union — mislabeling every cell and breaking
# outright when the model never predicts some class. Pinning labels= keeps the
# matrix and the DataFrame axes in the same, complete order.
class_labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred)]))
confusion = metrics.confusion_matrix(y_test, y_pred, labels=class_labels)
confusion = pd.DataFrame(confusion, index=class_labels, columns=class_labels)
confusion.to_csv("confusion_nn.csv")